Intro

url_pitching = https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2018%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&team=BOS&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0#results

url_batting = https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2018%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&team=BOS&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0#results

This data comes from the MLB Statcast dataset. The data can be downloaded from the URLs above by clicking on the icon above the Graphs column in the results table. Then load the data as shown below.

library(tidyverse)
library(stringr)
library(rvest)
library(ggplot2)
library(plotly)
library(tree)
library(randomForest)
library(png)
library(caret) 
library(rpart)

# Load the two Statcast CSV exports (pitching and batting) from the
# working directory into tibbles.
final_pitching_data_table <- read_csv(file = "redsox_pitching_5132018.csv")
final_batting_data_table <- read_csv(file = "redsox_batting_5132018.csv")

head(final_pitching_data_table)
## # A tibble: 6 x 87
##   pitch_type game_date  release_speed release_pos_x release_pos_z
##   <chr>      <date>     <chr>         <chr>         <chr>        
## 1 FF         2018-05-12 96.6          -3.1489       4.8075       
## 2 KC         2018-05-12 86.8          -3.0017       4.7448       
## 3 FF         2018-05-12 96.3          -3.0122       4.8443       
## 4 FF         2018-05-12 96.6          -2.8003       4.8929       
## 5 FF         2018-05-12 97.1          -3.0436       4.8279       
## 6 KC         2018-05-12 84.3          -2.9241       4.8362       
## # ... with 82 more variables: player_name <chr>, batter <int>,
## #   pitcher <int>, events <chr>, description <chr>, spin_dir <chr>,
## #   spin_rate_deprecated <chr>, break_angle_deprecated <chr>,
## #   break_length_deprecated <chr>, zone <chr>, des <chr>, game_type <chr>,
## #   stand <chr>, p_throws <chr>, home_team <chr>, away_team <chr>,
## #   type <chr>, hit_location <chr>, bb_type <chr>, balls <int>,
## #   strikes <int>, game_year <int>, pfx_x <chr>, pfx_z <chr>,
## #   plate_x <chr>, plate_z <chr>, on_3b <chr>, on_2b <chr>, on_1b <chr>,
## #   outs_when_up <int>, inning <int>, inning_topbot <chr>, hc_x <chr>,
## #   hc_y <chr>, tfs_deprecated <chr>, tfs_zulu_deprecated <chr>,
## #   pos2_person_id <chr>, umpire <chr>, sv_id <chr>, vx0 <chr>, vy0 <chr>,
## #   vz0 <chr>, ax <chr>, ay <chr>, az <chr>, sz_top <chr>, sz_bot <chr>,
## #   hit_distance_sc <chr>, launch_speed <chr>, launch_angle <chr>,
## #   effective_speed <chr>, release_spin_rate <chr>,
## #   release_extension <chr>, game_pk <int>, pos1_person_id <chr>,
## #   pos2_person_id_1 <chr>, pos3_person_id <chr>, pos4_person_id <chr>,
## #   pos5_person_id <chr>, pos6_person_id <chr>, pos7_person_id <chr>,
## #   pos8_person_id <chr>, pos9_person_id <chr>, release_pos_y <chr>,
## #   estimated_ba_using_speedangle <chr>,
## #   estimated_woba_using_speedangle <chr>, woba_value <chr>,
## #   woba_denom <chr>, babip_value <chr>, iso_value <chr>,
## #   launch_speed_angle <chr>, at_bat_number <int>, pitch_number <int>,
## #   pitch_name <chr>, home_score <int>, away_score <int>, bat_score <int>,
## #   fld_score <int>, post_away_score <int>, post_home_score <int>,
## #   post_bat_score <int>, post_fld_score <int>
head(final_batting_data_table)
## # A tibble: 6 x 87
##   pitch_type game_date  release_speed release_pos_x release_pos_z
##   <chr>      <date>     <chr>         <chr>         <chr>        
## 1 FF         2018-05-12 96.2          -2.1301       5.8072       
## 2 FC         2018-05-12 88.8          -2.1697       5.8660       
## 3 FC         2018-05-12 88.7          -2.2286       5.8761       
## 4 FC         2018-05-12 88.4          -2.3845       5.7680       
## 5 FF         2018-05-12 95.4          -2.2892       5.7659       
## 6 FF         2018-05-12 94.9          -2.2536       5.8817       
## # ... with 82 more variables: player_name <chr>, batter <int>,
## #   pitcher <int>, events <chr>, description <chr>, spin_dir <chr>,
## #   spin_rate_deprecated <chr>, break_angle_deprecated <chr>,
## #   break_length_deprecated <chr>, zone <chr>, des <chr>, game_type <chr>,
## #   stand <chr>, p_throws <chr>, home_team <chr>, away_team <chr>,
## #   type <chr>, hit_location <chr>, bb_type <chr>, balls <int>,
## #   strikes <int>, game_year <int>, pfx_x <chr>, pfx_z <chr>,
## #   plate_x <chr>, plate_z <chr>, on_3b <chr>, on_2b <chr>, on_1b <chr>,
## #   outs_when_up <int>, inning <int>, inning_topbot <chr>, hc_x <chr>,
## #   hc_y <chr>, tfs_deprecated <chr>, tfs_zulu_deprecated <chr>,
## #   pos2_person_id <chr>, umpire <chr>, sv_id <chr>, vx0 <chr>, vy0 <chr>,
## #   vz0 <chr>, ax <chr>, ay <chr>, az <chr>, sz_top <chr>, sz_bot <chr>,
## #   hit_distance_sc <chr>, launch_speed <chr>, launch_angle <chr>,
## #   effective_speed <chr>, release_spin_rate <chr>,
## #   release_extension <chr>, game_pk <int>, pos1_person_id <chr>,
## #   pos2_person_id_1 <chr>, pos3_person_id <chr>, pos4_person_id <chr>,
## #   pos5_person_id <chr>, pos6_person_id <chr>, pos7_person_id <chr>,
## #   pos8_person_id <chr>, pos9_person_id <chr>, release_pos_y <chr>,
## #   estimated_ba_using_speedangle <chr>,
## #   estimated_woba_using_speedangle <chr>, woba_value <chr>,
## #   woba_denom <chr>, babip_value <chr>, iso_value <chr>,
## #   launch_speed_angle <chr>, at_bat_number <int>, pitch_number <int>,
## #   pitch_name <chr>, home_score <int>, away_score <int>, bat_score <int>,
## #   fld_score <int>, post_away_score <int>, post_home_score <int>,
## #   post_bat_score <int>, post_fld_score <int>

Create ID Tables

If we look at the pitching and batting data, we will notice that there are fields containing ids that are not translated to names. So we will use the batting and pitching data to build an id lookup for each person. Since player_name is the name of the batter in the batting data and the name of the pitcher in the pitching data, we should be able to get all the ids by pairing the player_name field with the batter field in the batting table, and the player_name field with the pitcher field in the pitching table.

# Build name <-> id lookup tables for batters and pitchers.
# `distinct()` de-duplicates on exactly the columns it is given and keeps
# only those columns, so no `group_by()` is needed and the resulting tables
# are plain (ungrouped) tibbles.
batter_id_table <- final_batting_data_table %>%
  distinct(player_name, batter)

pitcher_id_table <- final_pitching_data_table %>%
  distinct(player_name, pitcher)

head(batter_id_table)
## # A tibble: 6 x 2
##   player_name        batter
##   <chr>               <int>
## 1 Hanley Ramirez     434670
## 2 Andrew Benintendi  643217
## 3 Mookie Betts       605141
## 4 Christian Vazquez  543877
## 5 Jackie Bradley Jr. 598265
## 6 Eduardo Nunez      456488
head(pitcher_id_table)
## # A tibble: 6 x 2
##   player_name      pitcher
##   <chr>              <int>
## 1 Craig Kimbrel     518886
## 2 Joe Kelly         523260
## 3 Hector Velazquez  584171
## 4 Carson Smith      605476
## 5 David Price       456034
## 6 Brian Johnson     598271

Get Mookie Betts Data

Mookie Betts is having a standout season. I would like to focus on data regarding Mookie’s at bats.

# Keep only the batting rows whose player_name is Mookie Betts.
mookie_data <- filter(final_batting_data_table, player_name == "Mookie Betts")

head(mookie_data)
## # A tibble: 6 x 87
##   pitch_type game_date  release_speed release_pos_x release_pos_z
##   <chr>      <date>     <chr>         <chr>         <chr>        
## 1 FF         2018-05-12 94.9          -2.2536       5.8817       
## 2 FF         2018-05-12 95.8          -2.2788       5.8093       
## 3 FT         2018-05-12 97.3          -2.4584       5.7709       
## 4 FF         2018-05-12 96.1          -2.2799       5.7348       
## 5 FF         2018-05-12 95.9          -1.9824       5.7779       
## 6 FF         2018-05-12 95.7          -2.3119       5.7480       
## # ... with 82 more variables: player_name <chr>, batter <int>,
## #   pitcher <int>, events <chr>, description <chr>, spin_dir <chr>,
## #   spin_rate_deprecated <chr>, break_angle_deprecated <chr>,
## #   break_length_deprecated <chr>, zone <chr>, des <chr>, game_type <chr>,
## #   stand <chr>, p_throws <chr>, home_team <chr>, away_team <chr>,
## #   type <chr>, hit_location <chr>, bb_type <chr>, balls <int>,
## #   strikes <int>, game_year <int>, pfx_x <chr>, pfx_z <chr>,
## #   plate_x <chr>, plate_z <chr>, on_3b <chr>, on_2b <chr>, on_1b <chr>,
## #   outs_when_up <int>, inning <int>, inning_topbot <chr>, hc_x <chr>,
## #   hc_y <chr>, tfs_deprecated <chr>, tfs_zulu_deprecated <chr>,
## #   pos2_person_id <chr>, umpire <chr>, sv_id <chr>, vx0 <chr>, vy0 <chr>,
## #   vz0 <chr>, ax <chr>, ay <chr>, az <chr>, sz_top <chr>, sz_bot <chr>,
## #   hit_distance_sc <chr>, launch_speed <chr>, launch_angle <chr>,
## #   effective_speed <chr>, release_spin_rate <chr>,
## #   release_extension <chr>, game_pk <int>, pos1_person_id <chr>,
## #   pos2_person_id_1 <chr>, pos3_person_id <chr>, pos4_person_id <chr>,
## #   pos5_person_id <chr>, pos6_person_id <chr>, pos7_person_id <chr>,
## #   pos8_person_id <chr>, pos9_person_id <chr>, release_pos_y <chr>,
## #   estimated_ba_using_speedangle <chr>,
## #   estimated_woba_using_speedangle <chr>, woba_value <chr>,
## #   woba_denom <chr>, babip_value <chr>, iso_value <chr>,
## #   launch_speed_angle <chr>, at_bat_number <int>, pitch_number <int>,
## #   pitch_name <chr>, home_score <int>, away_score <int>, bat_score <int>,
## #   fld_score <int>, post_away_score <int>, post_home_score <int>,
## #   post_bat_score <int>, post_fld_score <int>

Events

# rbi = batting-team score after the pitch minus the score before it.
mookie_data <- mutate(mookie_data, rbi = post_bat_score - bat_score)

# Count how many rows fall under each value of `events`.
summarise(group_by(mookie_data, events), count = n())
## # A tibble: 15 x 2
##    events                    count
##    <chr>                     <int>
##  1 double                       15
##  2 field_error                   2
##  3 field_out                    61
##  4 force_out                     3
##  5 grounded_into_double_play     2
##  6 hit_by_pitch                  3
##  7 home_run                     13
##  8 intent_walk                   2
##  9 null                        463
## 10 other_out                     1
## 11 sac_fly                       2
## 12 single                       18
## 13 strikeout                    17
## 14 triple                        1
## 15 walk                         15

Let's look at the at-bats again, but grouped by pitch type.

# Row counts for every (events, pitch_name) combination.
summarise(group_by(mookie_data, events, pitch_name), count = n())
## # A tibble: 64 x 3
## # Groups:   events [?]
##    events      pitch_name      count
##    <chr>       <chr>           <int>
##  1 double      2-Seam Fastball     1
##  2 double      4-Seam Fastball     6
##  3 double      Changeup            3
##  4 double      Curveball           1
##  5 double      Slider              4
##  6 field_error Changeup            1
##  7 field_error Slider              1
##  8 field_out   2-Seam Fastball     8
##  9 field_out   4-Seam Fastball    27
## 10 field_out   Changeup           10
## # ... with 54 more rows

Definitions of some of the raw data columns, taken from https://fastballs.wordpress.com/category/pitchfx-glossary/:

pfx_x: the horizontal movement, in inches, of the pitch between the release point and home plate, as compared to a theoretical pitch thrown at the same speed with no spin-induced movement. This parameter is measured at y=40 feet regardless of the y0 value.

pfx_z: the vertical movement, in inches, of the pitch between the release point and home plate, as compared to a theoretical pitch thrown at the same speed with no spin-induced movement. This parameter is measured at y=40 feet regardless of the y0 value.

plate_x: the left/right distance, in feet, of the pitch from the middle of the plate as it crossed home plate. The PITCHf/x coordinate system is oriented to the catcher’s/umpire’s perspective, with distances to the right being positive and to the left being negative.

plate_z: the height of the pitch in feet as it crossed the front of home plate.

# Narrow the table to the outcome, pitch, location, batted-ball and
# strike-zone columns used in the rest of the analysis.
mookie_bat_data <- select(
  mookie_data,
  events, pitch_name, release_speed, pitcher, bb_type,
  pfx_x, pfx_z, plate_x, plate_z,
  launch_angle, launch_speed, hit_distance_sc,
  sz_top, sz_bot
)

# Peek at the first few home-run rows.
head(filter(mookie_bat_data, events == "home_run"))
## # A tibble: 6 x 14
##   events  pitch_name   release_speed pitcher bb_type   pfx_x pfx_z plate_x
##   <chr>   <chr>        <chr>           <int> <chr>     <chr> <chr> <chr>  
## 1 home_r~ 2-Seam Fast~ 86.0           112526 fly_ball  -1.2~ 0.76~ -0.2037
## 2 home_r~ 4-Seam Fast~ 92.9           501985 line_dri~ 0.85~ 1.60~ 0.3792 
## 3 home_r~ 2-Seam Fast~ 92.9           518633 fly_ball  0.90~ 1.30~ 0.7083 
## 4 home_r~ Slider       86.8           518633 fly_ball  -0.2~ 0.13~ -0.0138
## 5 home_r~ Changeup     83.9           518633 fly_ball  1.12~ 1.21~ -0.3759
## 6 home_r~ 4-Seam Fast~ 91.6           592130 fly_ball  -1.1~ 1.32~ 0.6080 
## # ... with 6 more variables: plate_z <chr>, launch_angle <chr>,
## #   launch_speed <chr>, hit_distance_sc <chr>, sz_top <chr>, sz_bot <chr>

Before we can analyze and graph this we need to clean it up.

# Re-parse the measurement columns (read in as character) as numbers and
# turn the three categorical columns into factors.
# NOTE(review): entries such as "null" cannot be parsed as numbers and are
# expected to come back as NA (readr reports them as parsing failures).
mookie_bat_data <- mookie_bat_data %>%
  type_convert(
    cols(
      release_speed = col_double(),
      pfx_x = col_double(),
      pfx_z = col_double(),
      plate_x = col_double(),
      plate_z = col_double(),
      launch_angle = col_double(),
      launch_speed = col_double(),
      hit_distance_sc = col_integer(),
      sz_top = col_double(),
      sz_bot = col_double()
    )
  ) %>%
  mutate(
    pitch_name = as.factor(pitch_name),
    events = as.factor(events),
    bb_type = as.factor(bb_type)
  )
head(mookie_bat_data)
## # A tibble: 6 x 14
##   events pitch_name    release_speed pitcher bb_type   pfx_x pfx_z plate_x
##   <fct>  <fct>                 <dbl>   <int> <fct>     <dbl> <dbl>   <dbl>
## 1 double 4-Seam Fastb~          94.9  572193 line_dr~ -0.546  1.23  -0.684
## 2 null   4-Seam Fastb~          95.8  572193 null     -0.669  1.23   0.837
## 3 null   2-Seam Fastb~          97.3  572193 null     -1.31   1.14   1.24 
## 4 null   4-Seam Fastb~          96.1  572193 null     -0.361  1.47  -0.718
## 5 null   4-Seam Fastb~          95.9  572193 null     -0.432  1.52   1.73 
## 6 null   4-Seam Fastb~          95.7  572193 null     -0.711  1.30   0.263
## # ... with 6 more variables: plate_z <dbl>, launch_angle <dbl>,
## #   launch_speed <dbl>, hit_distance_sc <int>, sz_top <dbl>, sz_bot <dbl>
# Strike-zone box: top and bottom are the averages of the recorded sz_top /
# sz_bot values; the half-width is 17 / 2 / 12.
# NOTE(review): presumably half of a 17-inch plate converted to feet - confirm.
strike_zone_top <- mean(mookie_bat_data[["sz_top"]], na.rm = TRUE)
strike_zone_bot <- mean(mookie_bat_data[["sz_bot"]], na.rm = TRUE)
strike_zone_front <- 17 / 2.0 / 12


# Every pitch plotted at its plate location, with the strike-zone box drawn
# behind the points; the `text` aesthetic feeds the plotly hover tooltip.
graphed <- ggplot() +
  geom_rect(
    aes(
      xmin = -strike_zone_front,
      xmax = strike_zone_front,
      ymin = strike_zone_bot,
      ymax = strike_zone_top
    ),
    color = "black",
    fill = NA
  ) +
  geom_point(
    data = mookie_bat_data,
    aes(
      x = plate_x,
      y = plate_z,
      text = sprintf("events: %s<br>Pitch Name: %s<br>Release Speed: %s<br>Launch Angle: %s", events, pitch_name, release_speed, launch_angle)
    )
  ) +
  scale_x_continuous(
    name = "Catchers View | left/right distance in feet of center of plate",
    limits = c(-4, 4),
    breaks = seq(from = -4, to = 4, by = 2)
  ) +
  scale_y_continuous(
    name = "Height above plate In Feet",
    limits = c(-2, 6),
    breaks = seq(from = -2, to = 6, by = 1)
  )

ggplotly(graphed)

This is rather ugly. Let's plot only the pitches that result in action and color-code them by the event that happens.

# Only rows with a recorded event (events is not "null").
action <- filter(mookie_bat_data, events != "null")

# Same strike-zone plot as before, with points coloured by event.
graphed <- ggplot() +
  geom_rect(
    aes(
      xmin = -strike_zone_front,
      xmax = strike_zone_front,
      ymin = strike_zone_bot,
      ymax = strike_zone_top
    ),
    color = "black",
    fill = NA
  ) +
  geom_point(
    data = action,
    aes(
      x = plate_x,
      y = plate_z,
      text = sprintf("events: %s<br>Pitch Name: %s<br>Release Speed: %s<br>Launch Angle: %s", events, pitch_name, release_speed, launch_angle),
      color = events
    )
  ) +
  scale_x_continuous(
    name = "Catchers View | left/right distance in feet of center of plate",
    limits = c(-4, 4),
    breaks = seq(from = -4, to = 4, by = 2)
  ) +
  scale_y_continuous(
    name = "Height above plate In Feet",
    limits = c(-2, 6),
    breaks = seq(from = -2, to = 6, by = 1)
  )

ggplotly(graphed)

Mookie leads the league in home runs. Let’s filter by home runs and then graph them colored by pitch type.

# Only the rows recorded as home runs.
action <- filter(mookie_bat_data, events == "home_run")

# Strike-zone plot of the home runs, with points coloured by pitch type.
graphed <- ggplot() +
  geom_rect(
    aes(
      xmin = -strike_zone_front,
      xmax = strike_zone_front,
      ymin = strike_zone_bot,
      ymax = strike_zone_top
    ),
    color = "black",
    fill = NA
  ) +
  geom_point(
    data = action,
    aes(
      x = plate_x,
      y = plate_z,
      text = sprintf("events: %s<br>Pitch Name: %s<br>Release Speed: %s<br>Launch Angle: %s", events, pitch_name, release_speed, launch_angle),
      color = pitch_name
    )
  ) +
  scale_x_continuous(
    name = "Catchers View | left/right distance in feet of center of plate",
    limits = c(-4, 4),
    breaks = seq(from = -4, to = 4, by = 2)
  ) +
  scale_y_continuous(
    name = "Height above plate In Feet",
    limits = c(-2, 6),
    breaks = seq(from = -2, to = 6, by = 1)
  )

ggplotly(graphed)

Let's try to predict the outcome of a pitch.

# Fix the RNG seed so the random steps that follow are reproducible.
set.seed(1234)

# Modelling subset: rows with a recorded event other than an intentional
# walk and with a known plate_x, reduced to the outcome and four pitch
# descriptors.
mookie_bat_data_converted <- mookie_bat_data %>%
  filter(
    events != "null",
    events != "intent_walk",
    !is.na(plate_x)
  ) %>%
  select(events, pitch_name, release_speed, plate_x, plate_z)




mookie_bat_data_converted 
## # A tibble: 151 x 5
##    events      pitch_name      release_speed plate_x plate_z
##    <fct>       <fct>                   <dbl>   <dbl>   <dbl>
##  1 double      4-Seam Fastball          94.9 -0.684     3.03
##  2 single      2-Seam Fastball          92.7 -0.661     2.68
##  3 field_out   Changeup                 74.9 -0.0167    3.39
##  4 double      Changeup                 77.8  0.0608    1.58
##  5 field_out   4-Seam Fastball          89.8  0.404     2.85
##  6 strikeout   Slider                   84.3  1.34      1.21
##  7 field_error Slider                   86.0  0.646     3.05
##  8 field_out   Cutter                   85.5  0.471     2.67
##  9 field_out   2-Seam Fastball          93.8 -0.765     2.74
## 10 strikeout   Curveball                78.8  0.990     1.92
## # ... with 141 more rows
# Classification tree for the event, using pitch type, plate location and
# release speed as predictors.
mookie_tree <- tree(
  events ~ pitch_name + plate_x + plate_z + release_speed,
  data = mookie_bat_data_converted
)

# Draw the tree into a PNG file and then display that image, so the larger
# canvas keeps the node labels from overlapping.
png(filename = "image.png", width = 1200, height = 700)
plot(mookie_tree)
text(mookie_tree, pretty = 1, cex = 1.1)
dev.off()
## png 
##   2
img <- readPNG("image.png")
grid::grid.raster(img)

This tree shows us what the likely outcome of a pitch is, based on location, type, and speed.

Let's try to use a random forest.

# Encode pitch type as the integer code of its factor level.
# NOTE(review): this gives the pitch types an arbitrary numeric order;
# confirm that is intended rather than passing the factor to the model.
mookie_bat_data_converted$pitch_name <- as.numeric(mookie_bat_data_converted$pitch_name)

# Remove factor levels that no longer occur (e.g. the filtered-out events).
mookie_bat_data_converted <- droplevels(mookie_bat_data_converted)

# 60% of the rows of every event go to the training set. A temporary row id
# records exactly which rows were sampled.
train_set <- mookie_bat_data_converted %>%
  mutate(row_id = row_number()) %>%
  group_by(events) %>%
  sample_frac(.6) %>%
  ungroup()

# The test set is every row that was not sampled. Matching on the row id
# (instead of on all columns) keeps unsampled rows that happen to be exact
# duplicates of a sampled row.
test_set <- mookie_bat_data_converted %>%
  mutate(row_id = row_number()) %>%
  anti_join(train_set, by = "row_id") %>%
  select(-row_id)

# Drop the helper column so both sets keep the original five columns.
train_set <- select(train_set, -row_id)

train_set
## # A tibble: 91 x 5
##    events      pitch_name release_speed  plate_x plate_z
##    <fct>            <dbl>         <dbl>    <dbl>   <dbl>
##  1 double            3.00          77.8  0.0608     1.58
##  2 double            2.00          90.4 -0.630      1.69
##  3 double            4.00          77.3 -0.00490    2.43
##  4 double            2.00          92.9 -0.208      1.72
##  5 double            8.00          84.9  0.767      1.64
##  6 double            3.00          88.4  0.686      2.01
##  7 double            2.00          94.9 -0.684      3.03
##  8 double            8.00          82.1 -0.0844     2.40
##  9 double            3.00          82.3 -0.0869     1.05
## 10 field_error       3.00          83.0  0.0677     1.62
## # ... with 81 more rows
test_set
## # A tibble: 60 x 5
##    events      pitch_name release_speed plate_x plate_z
##    <fct>            <dbl>         <dbl>   <dbl>   <dbl>
##  1 field_out         3.00          74.9 -0.0167    3.39
##  2 field_error       8.00          86.0  0.646     3.05
##  3 field_out         5.00          85.5  0.471     2.67
##  4 field_out         1.00          93.8 -0.765     2.74
##  5 walk              2.00          95.1 -0.0117    3.85
##  6 field_out         2.00          98.0  0.260     2.08
##  7 single            3.00          83.1  0.609     2.16
##  8 single            2.00          91.5 -0.0774    2.62
##  9 double            8.00          80.9  0.701     2.29
## 10 field_out         2.00          96.7 -0.216     3.23
## # ... with 50 more rows
# Random forest for the event from pitch type and plate location
# (release_speed is not part of this formula).
mookie_rf <- randomForest(
  events ~ pitch_name + plate_x + plate_z,
  data = train_set
)


# Predict the held-out rows and cross-tabulate predicted vs. observed events.
test_predictions <- predict(mookie_rf, newdata = test_set)
cm <- table(pred = test_predictions, observed = test_set[["events"]])

confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##                            observed
## pred                        double field_error field_out force_out
##   double                         0           0         0         0
##   field_error                    0           0         0         0
##   field_out                      5           0        22         1
##   force_out                      0           0         0         0
##   grounded_into_double_play      0           0         0         0
##   hit_by_pitch                   0           0         0         0
##   home_run                       0           0         0         0
##   other_out                      0           0         0         0
##   sac_fly                        0           0         0         0
##   single                         0           0         1         0
##   strikeout                      1           1         0         0
##   triple                         0           0         0         0
##   walk                           0           0         1         0
##                            observed
## pred                        grounded_into_double_play hit_by_pitch
##   double                                            0            0
##   field_error                                       0            0
##   field_out                                         1            0
##   force_out                                         0            0
##   grounded_into_double_play                         0            0
##   hit_by_pitch                                      0            0
##   home_run                                          0            0
##   other_out                                         0            0
##   sac_fly                                           0            0
##   single                                            0            0
##   strikeout                                         0            0
##   triple                                            0            0
##   walk                                              0            1
##                            observed
## pred                        home_run other_out sac_fly single strikeout
##   double                           1         0       0      1         0
##   field_error                      0         0       0      0         0
##   field_out                        3         0       1      6         4
##   force_out                        0         0       0      0         0
##   grounded_into_double_play        0         0       0      0         0
##   hit_by_pitch                     0         0       0      0         0
##   home_run                         0         0       0      0         0
##   other_out                        0         0       0      0         0
##   sac_fly                          0         0       0      0         0
##   single                           1         0       0      0         0
##   strikeout                        0         0       0      0         3
##   triple                           0         0       0      0         0
##   walk                             0         0       0      0         0
##                            observed
## pred                        triple walk
##   double                         0    1
##   field_error                    0    0
##   field_out                      0    2
##   force_out                      0    0
##   grounded_into_double_play      0    0
##   hit_by_pitch                   0    0
##   home_run                       0    0
##   other_out                      0    0
##   sac_fly                        0    0
##   single                         0    0
##   strikeout                      0    0
##   triple                         0    0
##   walk                           0    3
## 
## Overall Statistics
##                                        
##                Accuracy : 0.4667       
##                  95% CI : (0.3367, 0.6)
##     No Information Rate : 0.4          
##     P-Value [Acc > NIR] : 0.1779       
##                                        
##                   Kappa : 0.2076       
##  Mcnemar's Test P-Value : NA           
## 
## Statistics by Class:
## 
##                      Class: double Class: field_error Class: field_out
## Sensitivity                 0.0000            0.00000           0.9167
## Specificity                 0.9444            1.00000           0.3611
## Pos Pred Value              0.0000                NaN           0.4889
## Neg Pred Value              0.8947            0.98333           0.8667
## Prevalence                  0.1000            0.01667           0.4000
## Detection Rate              0.0000            0.00000           0.3667
## Detection Prevalence        0.0500            0.00000           0.7500
## Balanced Accuracy           0.4722            0.50000           0.6389
##                      Class: force_out Class: grounded_into_double_play
## Sensitivity                   0.00000                          0.00000
## Specificity                   1.00000                          1.00000
## Pos Pred Value                    NaN                              NaN
## Neg Pred Value                0.98333                          0.98333
## Prevalence                    0.01667                          0.01667
## Detection Rate                0.00000                          0.00000
## Detection Prevalence          0.00000                          0.00000
## Balanced Accuracy             0.50000                          0.50000
##                      Class: hit_by_pitch Class: home_run Class: other_out
## Sensitivity                      0.00000         0.00000               NA
## Specificity                      1.00000         1.00000                1
## Pos Pred Value                       NaN             NaN               NA
## Neg Pred Value                   0.98333         0.91667               NA
## Prevalence                       0.01667         0.08333                0
## Detection Rate                   0.00000         0.00000                0
## Detection Prevalence             0.00000         0.00000                0
## Balanced Accuracy                0.50000         0.50000               NA
##                      Class: sac_fly Class: single Class: strikeout
## Sensitivity                 0.00000       0.00000          0.42857
## Specificity                 1.00000       0.96226          0.96226
## Pos Pred Value                  NaN       0.00000          0.60000
## Neg Pred Value              0.98333       0.87931          0.92727
## Prevalence                  0.01667       0.11667          0.11667
## Detection Rate              0.00000       0.00000          0.05000
## Detection Prevalence        0.00000       0.03333          0.08333
## Balanced Accuracy           0.50000       0.48113          0.69542
##                      Class: triple Class: walk
## Sensitivity                     NA     0.50000
## Specificity                      1     0.96296
## Pos Pred Value                  NA     0.60000
## Neg Pred Value                  NA     0.94545
## Prevalence                       0     0.10000
## Detection Rate                   0     0.05000
## Detection Prevalence             0     0.08333
## Balanced Accuracy               NA     0.73148

An accuracy of about 47% is really bad. There might not be enough data on Mookie alone to predict properly. What if we try all the Red Sox batters, with as many predictors as we can get? We also forgot that a 3-0 count is never going to result in a strikeout, whereas a 0-2 count will never produce a walk, and if no one is on base there can’t be a double play. So we add as many predictors as we can.

# Clean the full batting table for modelling.
#
# The export marks missing values with the string "null". Two groups of
# columns are treated differently:
#   * pitch-tracking fields (pitch type, speed, movement, location): "null"
#     becomes NA and the row is dropped, so untracked pitches are not fed
#     to the model as 0 mph pitches at the centre of the plate;
#   * base-runner and batted-ball fields: "null" becomes 0, because a
#     missing value there is expected for many rows (empty base, no ball
#     in play) and the model cannot take NA.
final_batting_data_table_converted <- final_batting_data_table %>%
  mutate(
    pitch_name = na_if(pitch_name, "null"),
    release_speed = na_if(release_speed, "null"),
    pfx_x = na_if(pfx_x, "null"),
    pfx_z = na_if(pfx_z, "null"),
    plate_x = na_if(plate_x, "null"),
    plate_z = na_if(plate_z, "null")
  ) %>%
  mutate(
    on_1b = replace(on_1b, on_1b == "null", 0),
    on_2b = replace(on_2b, on_2b == "null", 0),
    on_3b = replace(on_3b, on_3b == "null", 0),
    hit_location = replace(hit_location, hit_location == "null", 0),
    hit_distance_sc = replace(hit_distance_sc, hit_distance_sc == "null", 0),
    launch_angle = replace(launch_angle, launch_angle == "null", 0),
    launch_speed = replace(launch_speed, launch_speed == "null", 0)
  ) %>%
  type_convert(
    cols(
      release_speed = col_double(),
      pfx_x = col_double(),
      pfx_z = col_double(),
      plate_x = col_double(),
      plate_z = col_double(),
      on_1b = col_double(),
      on_2b = col_double(),
      on_3b = col_double(),
      hit_distance_sc = col_double(),
      launch_angle = col_double(),
      launch_speed = col_double(),
      hit_location = col_double()
    )
  ) %>%
  # Keep rows with a recorded event other than an intentional walk.
  filter(events != "intent_walk") %>%
  filter(events != "null") %>%
  # Drop rows with any missing pitch-tracking value.
  filter(!is.na(pitch_name)) %>%
  filter(!is.na(release_speed)) %>%
  filter(!is.na(plate_x)) %>%
  filter(!is.na(plate_z)) %>%
  filter(!is.na(pfx_x)) %>%
  filter(!is.na(pfx_z)) %>%
  mutate(rbi = post_bat_score - bat_score) %>%
  select(
    events, pitch_name, release_speed, plate_x, plate_z, pfx_x, pfx_z,
    batter, pitcher, balls, strikes, outs_when_up, on_1b, on_2b, on_3b,
    hit_location, hit_distance_sc, launch_speed, launch_angle,
    pitch_number, rbi
  )
final_batting_data_table_converted$pitch_name <- as.factor(final_batting_data_table_converted$pitch_name)
final_batting_data_table_converted$events <- as.factor(final_batting_data_table_converted$events)

# Any base-runner value still missing after conversion is treated as 0.
final_batting_data_table_converted$on_1b[is.na(final_batting_data_table_converted$on_1b)] <- 0
final_batting_data_table_converted$on_2b[is.na(final_batting_data_table_converted$on_2b)] <- 0
final_batting_data_table_converted$on_3b[is.na(final_batting_data_table_converted$on_3b)] <- 0

# We convert the pitch name to numeric (the integer code of its factor level)
final_batting_data_table_converted$pitch_name <- as.numeric(final_batting_data_table_converted$pitch_name)

# We drop unused levels
final_batting_data_table_converted <- droplevels(final_batting_data_table_converted)

# Use 80/20 split for train data / test data, sampled within each event.
# A temporary row id records exactly which rows were sampled.
train_set <- final_batting_data_table_converted %>%
  mutate(row_id = row_number()) %>%
  group_by(events) %>%
  sample_frac(.8) %>%
  ungroup()

# The test set is every row that was not sampled. Matching on the row id
# (instead of on all columns) keeps unsampled rows that happen to be exact
# duplicates of a sampled row, and no "Joining, by" message is emitted.
test_set <- final_batting_data_table_converted %>%
  mutate(row_id = row_number()) %>%
  anti_join(train_set, by = "row_id") %>%
  select(-row_id)

# Drop the helper column so both sets keep the original columns.
train_set <- select(train_set, -row_id)
train_set
## # A tibble: 1,238 x 21
##    events    pitch_name release_speed plate_x plate_z  pfx_x  pfx_z batter
##    <fct>          <dbl>         <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <int>
##  1 caught_s~       2.00          93.1 -0.364     1.73 -0.716  1.72  643217
##  2 caught_s~       8.00          93.9  1.09      2.37 -0.926  1.22  519048
##  3 double          2.00          91.2 -0.648     2.89 -0.769  1.48  543877
##  4 double          1.00          92.7 -0.508     1.46 -1.36   0.801 643217
##  5 double          1.00          92.7  0.0661    1.29 -1.36   0.924 646240
##  6 double          9.00          81.6  0.513     2.13  0.830 -0.319 646240
##  7 double          2.00          90.9  0.515     3.12 -0.764  1.76  519048
##  8 double          5.00          88.5 -0.162     1.79  0.196  0.565 593428
##  9 double          4.00          81.3 -0.611     3.06 -0.562 -0.928 543877
## 10 double          3.00          81.1  0.249     2.40  1.64   0.392 593428
## # ... with 1,228 more rows, and 13 more variables: pitcher <int>,
## #   balls <int>, strikes <int>, outs_when_up <int>, on_1b <dbl>,
## #   on_2b <dbl>, on_3b <dbl>, hit_location <dbl>, hit_distance_sc <dbl>,
## #   launch_speed <dbl>, launch_angle <dbl>, pitch_number <int>, rbi <int>
# Show the held-out test partition (auto-printed as a tibble preview).
test_set
## # A tibble: 309 x 21
##    events    pitch_name release_speed plate_x plate_z   pfx_x pfx_z batter
##    <fct>          <dbl>         <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <int>
##  1 field_out       3.00          75.6 -0.880    2.11  -1.21   1.59  643217
##  2 field_out       2.00          89.2  1.32     3.13  -0.258  1.92  456488
##  3 strikeout       3.00          75.9 -0.884    1.60  -1.02   1.54  646240
##  4 field_out       2.00          89.8  0.404    2.85   0.0873 1.87  605141
##  5 field_out       9.00          83.9  1.07     2.45   0.341  0.624 643217
##  6 strikeout       2.00          91.7 -0.929    2.49  -0.331  1.61  571788
##  7 strikeout       2.00          93.1  0.0445   3.22  -0.563  1.51  643217
##  8 field_out       5.00          85.5  0.471    2.67   0.189  0.493 605141
##  9 strikeout       3.00          82.9 -0.506    0.507 -1.51   0.526 506702
## 10 field_out       3.00          86.6 -0.723    1.41  -1.63   0.368 593428
## # ... with 299 more rows, and 13 more variables: pitcher <int>,
## #   balls <int>, strikes <int>, outs_when_up <int>, on_1b <dbl>,
## #   on_2b <dbl>, on_3b <dbl>, hit_location <dbl>, hit_distance_sc <dbl>,
## #   launch_speed <dbl>, launch_angle <dbl>, pitch_number <int>, rbi <int>
# Fit a random forest on the training partition. `events` is a factor, so this
# is a classification forest; the 20 predictors are listed explicitly.
final_rf <- randomForest(
  events ~ batter + pitcher + pitch_name + plate_x + plate_z + release_speed +
    balls + strikes + outs_when_up + on_1b + on_2b + on_3b + hit_location +
    pfx_x + pfx_z + hit_distance_sc + launch_speed + launch_angle +
    pitch_number + rbi,
  data = train_set
)

# Predict the outcome for the held-out rows and cross-tabulate the predictions
# (rows) against the observed outcomes (columns).
test_predictions <- predict(final_rf, newdata = test_set)
cm <- table(pred = test_predictions, observed = test_set$events)

# Summarise the contingency table: overall accuracy, kappa, per-class stats.
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##                            observed
## pred                        caught_stealing_2b double double_play
##   caught_stealing_2b                         0      0           0
##   double                                     0      9           0
##   double_play                                0      0           0
##   field_error                                0      0           0
##   field_out                                  0      1           1
##   fielders_choice                            0      0           0
##   fielders_choice_out                        0      0           0
##   force_out                                  0      0           0
##   grounded_into_double_play                  0      0           0
##   hit_by_pitch                               0      0           0
##   home_run                                   0      2           0
##   other_out                                  0      0           0
##   pickoff_2b                                 0      0           0
##   sac_bunt                                   0      0           0
##   sac_fly                                    0      0           0
##   single                                     0      7           0
##   strikeout                                  0      0           0
##   strikeout_double_play                      0      0           0
##   triple                                     0      0           0
##   walk                                       0      0           0
##                            observed
## pred                        field_error field_out fielders_choice
##   caught_stealing_2b                  0         0               0
##   double                              0         0               0
##   double_play                         0         0               0
##   field_error                         0         0               0
##   field_out                           4       126               0
##   fielders_choice                     0         0               0
##   fielders_choice_out                 0         0               0
##   force_out                           0         0               0
##   grounded_into_double_play           0         0               0
##   hit_by_pitch                        0         0               0
##   home_run                            0         0               0
##   other_out                           0         0               0
##   pickoff_2b                          0         0               0
##   sac_bunt                            0         0               0
##   sac_fly                             0         0               0
##   single                              0         0               0
##   strikeout                           0         0               0
##   strikeout_double_play               0         0               0
##   triple                              0         0               0
##   walk                                0         0               0
##                            observed
## pred                        fielders_choice_out force_out
##   caught_stealing_2b                          0         0
##   double                                      0         0
##   double_play                                 0         0
##   field_error                                 0         0
##   field_out                                   1         6
##   fielders_choice                             0         0
##   fielders_choice_out                         0         0
##   force_out                                   0         0
##   grounded_into_double_play                   0         0
##   hit_by_pitch                                0         0
##   home_run                                    0         0
##   other_out                                   0         0
##   pickoff_2b                                  0         0
##   sac_bunt                                    0         0
##   sac_fly                                     0         0
##   single                                      0         0
##   strikeout                                   0         0
##   strikeout_double_play                       0         0
##   triple                                      0         0
##   walk                                        0         0
##                            observed
## pred                        grounded_into_double_play hit_by_pitch
##   caught_stealing_2b                                0            0
##   double                                            0            0
##   double_play                                       0            0
##   field_error                                       0            0
##   field_out                                         4            0
##   fielders_choice                                   0            0
##   fielders_choice_out                               0            0
##   force_out                                         0            0
##   grounded_into_double_play                         1            0
##   hit_by_pitch                                      0            3
##   home_run                                          0            0
##   other_out                                         0            0
##   pickoff_2b                                        0            0
##   sac_bunt                                          0            0
##   sac_fly                                           0            0
##   single                                            0            0
##   strikeout                                         0            0
##   strikeout_double_play                             0            0
##   triple                                            0            0
##   walk                                              0            0
##                            observed
## pred                        home_run other_out pickoff_2b sac_bunt sac_fly
##   caught_stealing_2b               0         0          0        0       0
##   double                           2         0          0        0       0
##   double_play                      0         0          0        0       0
##   field_error                      0         0          0        0       0
##   field_out                        0         0          0        0       3
##   fielders_choice                  0         0          0        0       0
##   fielders_choice_out              0         0          0        0       0
##   force_out                        0         0          0        0       0
##   grounded_into_double_play        0         0          0        0       0
##   hit_by_pitch                     0         0          0        0       0
##   home_run                         9         0          0        0       0
##   other_out                        0         0          0        0       0
##   pickoff_2b                       0         0          0        0       0
##   sac_bunt                         0         0          0        0       0
##   sac_fly                          0         0          0        0       0
##   single                           0         0          0        0       0
##   strikeout                        0         0          0        0       0
##   strikeout_double_play            0         0          0        0       0
##   triple                           0         0          0        0       0
##   walk                             0         0          0        0       0
##                            observed
## pred                        single strikeout strikeout_double_play triple
##   caught_stealing_2b             0         0                     0      0
##   double                         4         0                     0      0
##   double_play                    0         0                     0      0
##   field_error                    0         0                     0      0
##   field_out                      1         1                     0      0
##   fielders_choice                0         0                     0      0
##   fielders_choice_out            0         0                     0      0
##   force_out                      0         0                     0      0
##   grounded_into_double_play      0         0                     0      0
##   hit_by_pitch                   0         0                     0      0
##   home_run                       0         0                     0      0
##   other_out                      0         0                     0      0
##   pickoff_2b                     0         0                     0      0
##   sac_bunt                       0         0                     0      0
##   sac_fly                        0         0                     0      0
##   single                        39         0                     0      1
##   strikeout                      0        62                     0      0
##   strikeout_double_play          0         0                     0      0
##   triple                         0         0                     0      0
##   walk                           0         0                     0      0
##                            observed
## pred                        walk
##   caught_stealing_2b           0
##   double                       0
##   double_play                  0
##   field_error                  0
##   field_out                    0
##   fielders_choice              0
##   fielders_choice_out          0
##   force_out                    0
##   grounded_into_double_play    0
##   hit_by_pitch                 0
##   home_run                     0
##   other_out                    0
##   pickoff_2b                   0
##   sac_bunt                     0
##   sac_fly                      0
##   single                       0
##   strikeout                    0
##   strikeout_double_play        0
##   triple                       0
##   walk                        22
## 
## Overall Statistics
##                                           
##                Accuracy : 0.877           
##                  95% CI : (0.8351, 0.9115)
##     No Information Rate : 0.4078          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8321          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: caught_stealing_2b Class: double
## Sensitivity                                 NA       0.47368
## Specificity                                  1       0.97931
## Pos Pred Value                              NA       0.60000
## Neg Pred Value                              NA       0.96599
## Prevalence                                   0       0.06149
## Detection Rate                               0       0.02913
## Detection Prevalence                         0       0.04854
## Balanced Accuracy                           NA       0.72650
##                      Class: double_play Class: field_error
## Sensitivity                    0.000000            0.00000
## Specificity                    1.000000            1.00000
## Pos Pred Value                      NaN                NaN
## Neg Pred Value                 0.996764            0.98706
## Prevalence                     0.003236            0.01294
## Detection Rate                 0.000000            0.00000
## Detection Prevalence           0.000000            0.00000
## Balanced Accuracy              0.500000            0.50000
##                      Class: field_out Class: fielders_choice
## Sensitivity                    1.0000                     NA
## Specificity                    0.8798                      1
## Pos Pred Value                 0.8514                     NA
## Neg Pred Value                 1.0000                     NA
## Prevalence                     0.4078                      0
## Detection Rate                 0.4078                      0
## Detection Prevalence           0.4790                      0
## Balanced Accuracy              0.9399                     NA
##                      Class: fielders_choice_out Class: force_out
## Sensitivity                            0.000000          0.00000
## Specificity                            1.000000          1.00000
## Pos Pred Value                              NaN              NaN
## Neg Pred Value                         0.996764          0.98058
## Prevalence                             0.003236          0.01942
## Detection Rate                         0.000000          0.00000
## Detection Prevalence                   0.000000          0.00000
## Balanced Accuracy                      0.500000          0.50000
##                      Class: grounded_into_double_play Class: hit_by_pitch
## Sensitivity                                  0.200000            1.000000
## Specificity                                  1.000000            1.000000
## Pos Pred Value                               1.000000            1.000000
## Neg Pred Value                               0.987013            1.000000
## Prevalence                                   0.016181            0.009709
## Detection Rate                               0.003236            0.009709
## Detection Prevalence                         0.003236            0.009709
## Balanced Accuracy                            0.600000            1.000000
##                      Class: home_run Class: other_out Class: pickoff_2b
## Sensitivity                  0.81818               NA                NA
## Specificity                  0.99329                1                 1
## Pos Pred Value               0.81818               NA                NA
## Neg Pred Value               0.99329               NA                NA
## Prevalence                   0.03560                0                 0
## Detection Rate               0.02913                0                 0
## Detection Prevalence         0.03560                0                 0
## Balanced Accuracy            0.90574               NA                NA
##                      Class: sac_bunt Class: sac_fly Class: single
## Sensitivity                       NA       0.000000        0.8864
## Specificity                        1       1.000000        0.9698
## Pos Pred Value                    NA            NaN        0.8298
## Neg Pred Value                    NA       0.990291        0.9809
## Prevalence                         0       0.009709        0.1424
## Detection Rate                     0       0.000000        0.1262
## Detection Prevalence               0       0.000000        0.1521
## Balanced Accuracy                 NA       0.500000        0.9281
##                      Class: strikeout Class: strikeout_double_play
## Sensitivity                    0.9841                           NA
## Specificity                    1.0000                            1
## Pos Pred Value                 1.0000                           NA
## Neg Pred Value                 0.9960                           NA
## Prevalence                     0.2039                            0
## Detection Rate                 0.2006                            0
## Detection Prevalence           0.2006                            0
## Balanced Accuracy              0.9921                           NA
##                      Class: triple Class: walk
## Sensitivity               0.000000      1.0000
## Specificity               1.000000      1.0000
## Pos Pred Value                 NaN      1.0000
## Neg Pred Value            0.996764      1.0000
## Prevalence                0.003236      0.0712
## Detection Rate            0.000000      0.0712
## Detection Prevalence      0.000000      0.0712
## Balanced Accuracy         0.500000      1.0000

An accuracy of about 88% (0.877) is a lot better. By adding more predictors we were able to drastically improve our accuracy.